/* Copyright (C) 2000-2002 Lavtech.com corp. All rights reserved.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
*/

/*   Implementation of UdmSearch's own built-in file backend, which lets
 *   UdmSearch be used without third-party libraries
 */

#include "udm_config.h"

#ifdef HAVE_FILES
#include <stdio.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

#include "udm_common.h"
#include "udm_spell.h"
#include "udm_db.h"
#include "udm_unicode.h"
#include "udm_robots.h"
#include "udm_mutex.h"
#include "udm_utils.h"
#include "udm_log.h"
#include "udm_conf.h"
#include "udm_crc32.h"
#include "udm_cache.h"
#include "udm_server.h"
#include "udm_xmalloc.h"
#include "udm_searchtool.h"
#include "udm_boolean.h"
#include "udm_stopwords.h"
#include "udm_doc.h"
#include "udm_result.h"
#include "udm_vars.h"
#include "udm_url.h"
#include "udm_db_int.h"
#include "udm_searchcache.h"




/*************  Words ****************************/

/*
 * One fixed-size record of a CRC dictionary file. UdmStoreWords() writes
 * arrays of these records with write() and UdmFindWords() reads them back
 * with read(), so the in-memory layout of this struct is the file format.
 */
typedef struct {
	int	url_id;	/* "ID" section of the document the word belongs to */
	uint4	coord;	/* word coordinate, copied from Doc->Words.Word[i].coord */
	int	wrd_id;	/* UdmStrCRC32() of the word text */
} UDM_CRCWORD;

	
/*
 * Open the URL list file "<db->addr.path>url.txt" with stdio mode fmode and
 * store the stream in db->url.
 * Returns UDM_OK, or UDM_ERROR with db->errstr/db->errcode filled in.
 */
static int UdmFilesOpenURLList(UDM_DB *db,const char *fmode){
	char	fname[1024];
	
	/* Bounded formatting: an over-long path is truncated, not overflowed */
	snprintf(fname,sizeof(fname),"%s%s",db->addr.path,"url.txt");
	db->url=fopen(fname,fmode);
	if(!db->url){
		sprintf(db->errstr,"Can't open URL file '%s' (%s)", fname, strerror(errno));
		db->errcode=1;
		return UDM_ERROR;
	}
	return UDM_OK;
}

/*
 * Open the files of the built-in database according to db->DBMode:
 *   UDM_DBMODE_SINGLE_CRC  "<path>dict"            -> db->crcdict[0]
 *   UDM_DBMODE_MULTI_CRC   "<path>dictNN", NN from 1 to MAXMULTI-1
 *                                                  -> db->crcdict[NN]
 *   any other mode         "<path>dict.txt"        -> db->dict
 * and, in every mode, "<path>url.txt" -> db->url.
 *
 * db->open_mode selects read-only access (UDM_OPEN_MODE_READ) or
 * create/truncate for writing (everything else).
 *
 * Returns UDM_OK when the database is (or already was) connected, otherwise
 * UDM_ERROR with db->errstr and db->errcode set.
 *
 * NOTE(review): files opened before a later open fails are left open, and
 * db->errstr is still filled with sprintf() because its size is not visible
 * from this file - confirm it can hold a 1023 byte path plus the message.
 */
static int UdmConnectFiles(UDM_DB *db){
	const char	*fmode;
	int		flags;
	int		perm=0;
	char		fname[1024];
	
	if(db->connected)return UDM_OK;
	
	switch(db->open_mode){
		case UDM_OPEN_MODE_READ:
			fmode="r";
			flags=O_RDONLY|UDM_BINARY;
			break;
		case UDM_OPEN_MODE_WRITE:
		default:
			fmode="w";
			flags=O_WRONLY|O_CREAT|O_TRUNC|UDM_BINARY;
			perm=UDM_IWRITE;
			break;
	}
	
	if(db->DBMode==UDM_DBMODE_SINGLE_CRC){
		snprintf(fname,sizeof(fname),"%s%s",db->addr.path,"dict");
		db->crcdict[0]=open(fname,flags,perm);
		if(db->crcdict[0]<0){
			sprintf(db->errstr,"Can't open dict file '%s' (%s)", fname, strerror(errno));
			db->errcode=1;
			return UDM_ERROR;
		}
	}else
	if(db->DBMode==UDM_DBMODE_MULTI_CRC){
		int	fnum;
		
		/* Slot 0 is not used in this mode; files are numbered from 1 */
		for(fnum=1;fnum<MAXMULTI;fnum++){
			snprintf(fname,sizeof(fname),"%sdict%02d",db->addr.path,fnum);
			db->crcdict[fnum]=open(fname,flags,perm);
			if(db->crcdict[fnum]<0){
				sprintf(db->errstr,"Can't open dict file '%s' (%s)", fname, strerror(errno));
				db->errcode=1;
				return UDM_ERROR;
			}
		}
	}else{
		snprintf(fname,sizeof(fname),"%s%s",db->addr.path,"dict.txt");
		db->dict=fopen(fname,fmode);
		if(!db->dict){
			sprintf(db->errstr,"Can't open dict file '%s' (%s)", fname, strerror(errno));
			db->errcode=1;
			return UDM_ERROR;
		}
	}
	
	/* The URL list is opened the same way in every mode */
	if(UDM_OK!=UdmFilesOpenURLList(db,fmode))
		return UDM_ERROR;
	
	db->connected=1;
	return UDM_OK;
}


/*
 * Remove the words of a document from the index.
 * Only the cache storage mode does any work here, by delegating to
 * UdmDeleteURLFromCache() and returning its result; in every other mode
 * the call does nothing and reports UDM_OK.
 */
static int UdmDeleteWordFromURL(UDM_AGENT* Indexer,UDM_DOCUMENT * Doc,UDM_DB *db){
	int	doc_id=UdmVarListFindInt(&Doc->Sections,"ID",0);
	
	if(db->DBMode!=UDM_DBMODE_CACHE)
		return UDM_OK;
	
	return UdmDeleteURLFromCache(Indexer,doc_id,db);
}

static int UdmStoreWords(UDM_AGENT * Indexer, UDM_DOCUMENT *Doc,UDM_DB *db){
	int	url_id=UdmVarListFindInt(&Doc->Sections,"ID",0);
	
	if(db->DBMode==UDM_DBMODE_SINGLE_CRC){
		int fd,i;
		size_t bytes;
		UDM_CRCWORD * cw;
		if(Doc->Words.nwords){
			fd=db->crcdict[0];
			bytes=sizeof(UDM_CRCWORD)*Doc->Words.nwords;
			cw=(UDM_CRCWORD*)UdmXmalloc(bytes);
			for(i=0;i<Doc->Words.nwords;i++){
				if(Doc->Words.Word[i].coord){
					cw[i].url_id=url_id;
					cw[i].coord=Doc->Words.Word[i].coord;
					cw[i].wrd_id=UdmStrCRC32(Doc->Words.Word[i].word);
				}
			}
			if((bytes!=write(fd,(void *)cw,bytes))){
				sprintf(db->errstr,"Can't write to dict file (%s)", strerror(errno));
				db->errcode=1;
				return(UDM_ERROR);
			}
			free(cw);
		}
	}else
	if(db->DBMode==UDM_DBMODE_MULTI_CRC){
		int fd,i,j,len;
		size_t bytes;
		UDM_CRCWORD * cw;
		if(Doc->Words.nwords){
			bytes=sizeof(UDM_CRCWORD)*Doc->Words.nwords;
			cw=(UDM_CRCWORD*)UdmXmalloc(bytes);
			for(len=1;len<MAXMULTI;len++){
				j=0;
				for(i=0;i<Doc->Words.nwords;i++)
				if((Doc->Words.Word[i].coord)&&len==strlen(Doc->Words.Word[i].word)){
					cw[j].url_id=url_id;
					cw[j].coord=Doc->Words.Word[i].coord;
					cw[j].wrd_id=UdmStrCRC32(Doc->Words.Word[i].word);
					j++;
				}
				fd=((UDM_DB*)(db))->crcdict[len];
				bytes=sizeof(UDM_CRCWORD)*j;
				if((bytes!=write(fd,(void *)cw,bytes))){
					sprintf(db->errstr,"Can't write to dict file (%s)", strerror(errno));
					db->errcode=1;
					return(UDM_ERROR);
				}
			}
			free(cw);
		}
	}else
	if(db->DBMode==UDM_DBMODE_CACHE){
		int res;
		res=UdmStoreWordsCache(Indexer,Doc,db);
	}else{
		FILE *f;
		int i;
		f=db->dict;
		for(i=0;i<Doc->Words.nwords;i++){
			if(Doc->Words.Word[i].coord){
				fprintf(f,"%d\t%d\t%s\n",
					url_id,
					Doc->Words.Word[i].coord,
					Doc->Words.Word[i].word);
			}
		}
	}
	return(UDM_OK);
}

/************ URLs stuff *******************************/


/*
 * Append a new target document to Indexer->Conf->Targets.
 * The new entry gets "ID" = its 1-based position in the list and copies of
 * the "Hops", "Referrer-ID" and "URL" sections of Doc.
 *
 * Returns UDM_OK, or UDM_ERROR (with db->errstr/db->errcode set) when the
 * target list cannot be grown; in that case the existing list is left
 * untouched.
 */
static int UdmAddURL(UDM_AGENT* Indexer,UDM_DOCUMENT * Doc,UDM_DB *db){
	UDM_DOCUMENT	*grown;
	UDM_DOCUMENT	*D;
	
	/* Keep the old pointer until realloc() is known to have succeeded */
	grown=(UDM_DOCUMENT *)realloc(Indexer->Conf->Targets.Doc,
		sizeof(UDM_DOCUMENT)*(Indexer->Conf->Targets.num_rows+1));
	if(!grown){
		sprintf(db->errstr,"Can't allocate memory for a new target URL");
		db->errcode=1;
		return(UDM_ERROR);
	}
	Indexer->Conf->Targets.Doc=grown;
	
	D=&grown[Indexer->Conf->Targets.num_rows];
	UdmDocInit(D);
	UdmVarListAddInt(&D->Sections,"ID",(int)Indexer->Conf->Targets.num_rows+1);
	UdmVarListAddInt(&D->Sections,"Hops",UdmVarListFindInt(&Doc->Sections,"Hops",0));
	UdmVarListAddInt(&D->Sections,"Referrer-ID",UdmVarListFindInt(&Doc->Sections,"Referrer-ID",0));
	UdmVarListAddStr(&D->Sections,"URL",UdmVarListFindStr(&Doc->Sections,"URL",""));
	Indexer->Conf->Targets.num_rows++;
	return(UDM_OK);
}

/*
 * Write the text form of Doc (as produced by UdmDocToTextBuf()) as one line
 * of the URL file db->url, but only if a document with the same "ID" is
 * present in Indexer->Conf->Indexed. At most one line is written.
 *
 * Returns UDM_OK, or UDM_ERROR with db->errstr/db->errcode set when the
 * text buffer cannot be allocated or the line cannot be written.
 */
static int UdmLongUpdateURL(UDM_AGENT* Indexer,UDM_DOCUMENT * Doc,UDM_DB *db){
	size_t		i;
	int		rc=UDM_OK;
	int		url_id=UdmVarListFindInt(&Doc->Sections,"ID",0);
	const size_t	maxlen=10*1024;
	char		*textbuf=(char*)malloc(maxlen);
	
	if(!textbuf){
		sprintf(db->errstr,"Can't allocate memory for URL record");
		db->errcode=1;
		return(UDM_ERROR);
	}
	
	for(i=0;i<Indexer->Conf->Indexed.num_rows;i++){
		int	url_id1=UdmVarListFindInt(&Indexer->Conf->Indexed.Doc[i].Sections,"ID",0);
		
		if(url_id1!=url_id)continue;
		
		UdmDocToTextBuf(Doc,textbuf,maxlen);
		if(fprintf(db->url,"%s\n",textbuf)<0){
			sprintf(db->errstr,"Can't write to URL file (%s)", strerror(errno));
			db->errcode=1;
			rc=UDM_ERROR;
		}
		break;
	}
	free(textbuf);
	return(rc);
}



/**************** search stuff *******************/

/*
 * Walk the URL file (db->url) line by line and, for every result document
 * whose "ID" equals the "ID" of the record on that line, either fill the
 * result document from the record or drop it from Res->Doc when it fails
 * the query limits:
 *   "ul" - must occur as a substring of the document's "URL" section
 *   "t"  - must be equal to the document's "Tag" section
 * A limit is only applied when both the query variable and the document
 * section are present.
 *
 * On return Res->CoordList.ncoords is reduced by the number of dropped
 * documents and Res->total_found is set to the same value.
 * Always returns UDM_OK.
 *
 * Reading starts at the current position of db->url; the stream is not
 * rewound here.
 */
static int UdmApplyURLLimits(UDM_AGENT *query,UDM_RESULT *Res,UDM_DB *db){
	char		str[4*1024];
	size_t		i;
	size_t		skip=0;	/* number of documents dropped so far */
	
	while(fgets(str,sizeof(str),db->url)){
		UDM_DOCUMENT	Doc;
		
		/* Parse the line into a temporary document to learn its "ID" */
		UdmDocInit(&Doc);
		UdmDocFromTextBuf(&Doc,str);
		
		/* Only the first ncoords-skip entries of Res->Doc are still live */
		for(i=0;i<Res->CoordList.ncoords-skip;i++){
			UDM_DOCUMENT	*D=&Res->Doc[i];
			const char	*strlim;
			const char	*strval;
			int		matched=1;
			int		Doc_url_id=UdmVarListFindInt(&Doc.Sections,"ID",0);
			int		D_url_id=UdmVarListFindInt(&D->Sections,"ID",0);
			
			if(Doc_url_id!=D_url_id)continue;
			
			/* NOTE(review): both limits below read sections of D, which is
			 * only filled from the record further down; if D carries no
			 * "URL"/"Tag" yet, strval is NULL and nothing is ever rejected.
			 * Possibly Doc.Sections was meant - confirm. */
			
			/* Check "ul" subsection match  */
			strlim=UdmVarListFindStr(&query->Conf->Vars,"ul",NULL);
			strval=UdmVarListFindStr(&D->Sections,"URL",NULL);
			if(strval&&strlim)matched=(strstr(strval,strlim)!=NULL);
			
			/* Check "tag" subsection match */
			strlim=UdmVarListFindStr(&query->Conf->Vars,"t",NULL);
			strval=UdmVarListFindStr(&D->Sections,"Tag",NULL);
			if(strval&&matched&&strlim)matched=(!strcmp(strval,strlim));
			
			if(!matched){
				/* Skip this URL */
				/* Close the gap by shifting the tail of the array down.
				 * NOTE(review): the dropped document is overwritten without
				 * being freed here, the element moved into slot i is not
				 * re-examined for this record, and Res->CoordList.Coords is
				 * not compacted alongside Res->Doc - confirm. */
				memmove(D,D+1,(Res->CoordList.ncoords-i-1)*sizeof(UDM_DOCUMENT));
				skip++;
			}else{
				/* Accepted: load the stored sections into the result */
				UdmDocFromTextBuf(D,str);
			}
		}
		UdmDocFree(&Doc);
	}
	Res->CoordList.ncoords-=skip;
	Res->total_found=Res->CoordList.ncoords;
	return UDM_OK;
}


static int UdmFindWords(UDM_AGENT * query, UDM_RESULT *Res,UDM_DB *db){
	int		i,j,url_id;
	UDM_URL_CRD	*wrd=Res->CoordList.Coords;
	char		str[4048];
	int		wf[256];
	int		word_match=UdmMatchMode(UdmVarListFindStr(&query->Conf->Vars,"wm","wrd"));
	size_t		topcount;
	int		page_number = UdmVarListFindInt(&query->Conf->Vars,"np",0);
	int		page_size   = UdmVarListFindInt(&query->Conf->Vars,"ps",20);
	
	if((!db->connected)&&(UDM_OK!=UdmConnectFiles(db)))
		return NULL;
	
	UdmWeightFactorsInit(UdmVarListFindStr(&query->Conf->Vars,"wf",""),wf);
	
	/* Now find each word */
	if(db->DBMode==UDM_DBMODE_SINGLE_CRC){
		UDM_CRCWORD cw[256];
		int bytes, wnum;
		while((bytes=read(db->crcdict[0],&cw,sizeof(cw)))){
			wnum=bytes/sizeof(UDM_CRCWORD);
			for(j=0;j<wnum;j++){
				for(i=0;i<Res->WWList.nwords;i++){
				  if (Res->WWList.Word[i].origin == UDM_WORD_ORIGIN_STOP) continue;
					if(Res->WWList.Word[i].crcword==cw[j].wrd_id){
						uint4 section;
						uint4 coord;
						uint4 weight;
						
						coord=cw[j].coord;
						section=UDM_WRDSEC(coord);
						weight=wf[section];
						
						if(weight){
							wrd=(UDM_URL_CRD*)UdmXrealloc(wrd,(Res->CoordList.ncoords+1)*sizeof(UDM_URL_CRD));
							coord=coord&0xFFFF0000;
							wrd[Res->CoordList.ncoords].url_id=cw[j].url_id;
							wrd[Res->CoordList.ncoords].coord=coord+(weight<<8)+Res->WWList.Word[i].order;
							Res->CoordList.ncoords++;
						}
					}
				}
			}
		}
	}else
	if(db->DBMode==UDM_DBMODE_MULTI_CRC){
		UDM_CRCWORD cw[256];
		int bytes,len,fd,wnum;

		for(i=0;i<Res->WWList.nwords;i++){
		  if (Res->WWList.Word[i].origin == UDM_WORD_ORIGIN_STOP) continue;
			len=strlen(Res->WWList.Word[i].word);
			if(len>=MAXMULTI)len=MAXMULTI-1;
			fd=db->crcdict[len];
			lseek(fd,(off_t)0,SEEK_SET);
			while((bytes=read(fd,&cw,sizeof(cw)))){
				wnum=bytes/sizeof(UDM_CRCWORD);
				for(j=0;j<wnum;j++){
					if(Res->WWList.Word[i].crcword==cw[j].wrd_id){
						uint4 section;
						uint4 coord;
						uint4 weight;
						
						coord=cw[j].coord;
						section=UDM_WRDSEC(coord);
						weight=wf[section];
						
						if(weight){
							wrd=(UDM_URL_CRD*)UdmXrealloc(wrd,(Res->CoordList.ncoords+1)*sizeof(UDM_URL_CRD));
							coord=coord&0xFFFF0000;
							wrd[Res->CoordList.ncoords].url_id=cw[j].url_id;
							wrd[Res->CoordList.ncoords].coord=coord+(weight<<8)+Res->WWList.Word[i].order;
							Res->CoordList.ncoords++;
						}
					}
				}
			}
		}
	}else{
		/* UDM_DBMODE_SINGLE */
		while(fgets(str,sizeof(str),db->dict)){
			char * w, * lasttok=NULL;
			uint4 coord;
						
			w=strtok_r(str,"\t",&lasttok);
			if(!w)continue;url_id=atoi(w);
			w=strtok_r(NULL,"\t",&lasttok);
			if(!w)continue;coord=atoi(w);
			w=strtok_r(NULL,"\t\n",&lasttok);
			if(!w)continue;
			
			for(i=0;i<Res->WWList.nwords;i++){
				int matches=0;
				
				if (Res->WWList.Word[i].origin == UDM_WORD_ORIGIN_STOP) continue;
				switch(word_match){
					case UDM_MATCH_BEGIN:
						matches=!strncmp(w,Res->WWList.Word[i].word,strlen(Res->WWList.Word[i].word));
						break;
					case UDM_MATCH_END:
						if(strlen(Res->WWList.Word[i].word)<=strlen(w)){
							char *ending;
							ending=w+strlen(w)-strlen(Res->WWList.Word[i].word);
							matches=!strcmp(ending,Res->WWList.Word[i].word);
						}
						break;
					case UDM_MATCH_SUBSTR:
						matches=(strstr(w,Res->WWList.Word[i].word)!=NULL);
						break;
					case UDM_MATCH_FULL:
					default:
						matches=!strcmp(w,Res->WWList.Word[i].word);
						break;
				}
				
				if(matches){
					uint4 section;
					uint4 weight;
					
					section=UDM_WRDSEC(coord);
					weight=wf[section];
					
					if(weight){
						wrd=(UDM_URL_CRD*)realloc(wrd,(Res->CoordList.ncoords+1)*sizeof(UDM_URL_CRD));
						wrd[Res->CoordList.ncoords].url_id=url_id;
						wrd[Res->CoordList.ncoords].coord=coord+(weight<<8)+Res->WWList.Word[i].order;
						Res->CoordList.ncoords++;
					}
				}
			}
		}
	}
	Res->CoordList.Coords=wrd;
	
	/* Now let's sort in url_id order then group results */
	UdmSortSearchWordsByURL(Res->CoordList.Coords,Res->CoordList.ncoords);
	UdmGroupByURL(query,Res);
	
	
	/* Sort by the weight */
	topcount=page_size*(page_number+1)-1;
	if(topcount>=Res->CoordList.ncoords)topcount=Res->CoordList.ncoords-1;
	if(topcount<UDM_FAST_PRESORT_DOCS){
		UdmWrdTopSort(Res->CoordList.Coords,Res->CoordList.ncoords,topcount);
	}else{
		UdmSortSearchWordsByWeight(Res->CoordList.Coords,Res->CoordList.ncoords);
	}
	
	/* Copy SEARCHWORD to DOC structure */
	Res->Doc=(UDM_DOCUMENT*)malloc(sizeof(UDM_DOCUMENT)*(Res->CoordList.ncoords));
	for(i=0;i<Res->CoordList.ncoords;i++){
		UdmDocInit(&Res->Doc[i]);
		UdmVarListReplaceInt(&Res->Doc[i].Sections,"ID",Res->CoordList.Coords[i].url_id);
		UdmVarListReplaceInt(&Res->Doc[i].Sections,"Score",123);
	}
	UdmApplyURLLimits(query,Res,db);
	
	return UDM_OK;
}


/*
 * Run a search against the built-in file database of query->Conf->db and
 * cut the requested page out of the result.
 *
 * The page is selected by the query variables "np" (page number, default 0)
 * and "ps" (page size, default 20). On return Res->Doc[0..num_rows-1] holds
 * the documents of that page, each with an "Order" section, and
 * Res->first/Res->last hold the 1-based positions of the first and last
 * document of the page.
 *
 * db->errstr and db->errcode are copied to query->Conf in every case.
 * Returns UDM_OK, or the error returned by UdmFindWords().
 *
 * NOTE(review): the result of UdmFindCache() is not examined because its
 * return type is not visible from this file.
 */
int UdmFindFiles(UDM_AGENT * query, UDM_RESULT *Res){
	UDM_DB		*db=query->Conf->db;
	int		i,res=UDM_OK;
	int		page_number = UdmVarListFindInt(&query->Conf->Vars,"np",0);
	int		page_size   = UdmVarListFindInt(&query->Conf->Vars,"ps",20);
	UDM_CHARSET	*lcs=UdmGetCharSet(UdmVarListFindStr(&query->Conf->Vars,"LocalCharset","iso-8859-1"));
	UDM_CHARSET	*bcs=UdmGetCharSet(UdmVarListFindStr(&query->Conf->Vars,"BrowserCharset","iso-8859-1"));

	/* Unknown charset names fall back to iso-8859-1 */
	if(!lcs)lcs=UdmGetCharSet("iso-8859-1");
	if(!bcs)bcs=UdmGetCharSet("iso-8859-1");
	
	UdmPrepare(query,Res);
	
	if(db->DBMode==UDM_DBMODE_CACHE){
		UdmFindCache(query,Res);
	}else{
		/* Keep the outcome so that a failed search is reported */
		res=UdmFindWords(query,Res,db);
	}
	
	/* Return if the search failed or nothing was found */
	if(res!=UDM_OK||!Res->CoordList.ncoords)
		goto ret;
	
	/* 0-based index of the first document of the page, clamped to the end */
	Res->first=page_number*page_size;
	if(Res->first>Res->CoordList.ncoords)Res->first=Res->CoordList.ncoords;
	Res->num_rows=Res->CoordList.ncoords-Res->first;
	if((Res->CoordList.ncoords-Res->first)>page_size)Res->num_rows=page_size;
	
	if(Res->first>0){
		/* Drop the documents of earlier pages, then move the page to the front */
		for(i=0;i<Res->first;i++){
			UdmVarListFree(&Res->Doc[i].Sections);
		}
		memmove(&(Res->Doc[0]),&(Res->Doc[Res->first]),Res->num_rows*sizeof(UDM_DOCUMENT));/* FIXME realloc required */
	}
	/* From here on first/last are 1-based positions */
	Res->first++;
	Res->last=Res->first+Res->num_rows-1;
	
	for(i=0;i<Res->num_rows;i++){
		UdmVarListReplaceInt(&Res->Doc[i].Sections,"Order",(int)Res->first+i);
	}
	
ret:	
	UdmResHlConvert(Res,lcs,bcs);
	strcpy(query->Conf->errstr,db->errstr);
	query->Conf->errcode=db->errcode;
	return res;
}



/***************** Extern functions ***************/

/*
 * Clearing the built-in database is a no-op: nothing is touched and
 * UDM_OK is always returned.
 */
int UdmClearDBFiles(UDM_AGENT* Indexer,UDM_DB *db){
	(void)Indexer;
	(void)db;
	return UDM_OK;
}


/*
 * Dispatch a URL action for the built-in file database.
 * The database is connected first if necessary; a connect failure is
 * returned as is.
 *
 * ADD, LUPDATE, INSWORDS and DELWORDS are forwarded to the matching static
 * helper. DOCCOUNT sets A->doccount to 0. DELCWORDS and unknown commands
 * yield UDM_ERROR. All remaining known commands do nothing and yield
 * UDM_OK.
 */
int UdmURLActionFiles(UDM_AGENT * A, UDM_DOCUMENT * D, int cmd,UDM_DB *db){
	if(!db->connected){
		int	rc=UdmConnectFiles(db);
		if(rc!=UDM_OK)
			return rc;
	}
	
	switch(cmd){
		/* Actions with a real implementation */
		case UDM_URL_ACTION_ADD:
			return UdmAddURL(A,D,db);
		case UDM_URL_ACTION_LUPDATE:
			return UdmLongUpdateURL(A,D,db);
		case UDM_URL_ACTION_INSWORDS:
			return UdmStoreWords(A,D,db);
		case UDM_URL_ACTION_DELWORDS:
			return UdmDeleteWordFromURL(A,D,db);
		
		case UDM_URL_ACTION_DOCCOUNT:
			A->doccount=0;
			return UDM_OK;
		
		/* Accepted, but nothing to do */
		case UDM_URL_ACTION_DELETE:
		case UDM_URL_ACTION_SUPDATE:
		case UDM_URL_ACTION_INSCWORDS:
		case UDM_URL_ACTION_UPDCLONE:
		case UDM_URL_ACTION_REGCHILD:
		case UDM_URL_ACTION_FINDBYURL:
		case UDM_URL_ACTION_FINDBYMSG:
		case UDM_URL_ACTION_FINDORIG:
		case UDM_URL_ACTION_EXPIRE:
		case UDM_URL_ACTION_REFERERS:
			return UDM_OK;
		
		/* Rejected */
		case UDM_URL_ACTION_DELCWORDS:
		default:
			return UDM_ERROR;
	}
}

/*
 * Result actions for the built-in file database.
 * Only UDM_RES_ACTION_TARGETS is accepted (and does nothing); every other
 * command, including WORDS and DOCINFO, yields UDM_ERROR.
 */
int UdmResActionFiles(UDM_AGENT *Agent, UDM_RESULT *Res, int cmd,UDM_DB *db){
	if(cmd==UDM_RES_ACTION_TARGETS)
		return UDM_OK;
	
	return UDM_ERROR;
}

/*
 * Category actions are not available in the built-in database: the call
 * only records an error message and code in db and returns UDM_ERROR.
 */
int UdmCatActionFiles(UDM_AGENT * Indexer,UDM_CATEGORY *C, int cmd,UDM_DB *db){
	static const char	msg[]="Categories are not supported in built-in database";
	
	strcpy(db->errstr,msg);
	db->errcode=1;
	return UDM_ERROR;
}

/*
 * Server actions are not available in the built-in database: whatever the
 * command (UDM_SRV_ACTION_TABLE included), the call records an error
 * message and code in db and returns UDM_ERROR.
 */
int UdmSrvActionFiles(UDM_AGENT * Indexer, UDM_SERVERLIST *S, int cmd,UDM_DB *db){
	static const char	msg[]="ServerTable is not supported in built-in database";
	
	(void)cmd;
	strcpy(db->errstr,msg);
	db->errcode=1;
	return UDM_ERROR;
}


/*
 * Statistics are not available in the built-in database: the first
 * element of Stats is zero-filled, an error message and code are recorded
 * in db, and UDM_ERROR is returned.
 */
int UdmStatActionFiles(UDM_AGENT *Indexer,UDM_STATLIST *Stats,UDM_DB *db){
	static const char	msg[]="Statistics is not supported in built-in database";
	
	memset(Stats,0,sizeof(Stats[0]));
	strcpy(db->errstr,msg);
	db->errcode=1;
	return UDM_ERROR;
}


#endif
